Data Cleaning

Author
Affiliation
Thomas Pollet and Connor Malcolm

Northumbria University

Published

January 30, 2022

Modified

May 16, 2023

Abstract
Data cleaning in preparation for the analyses.

Data Cleaning

Scopus merge with Web of Science

Merging Scopus and Web of Science data.

All Open Access

# Column names present in both the Scopus and the Web of Science
# "all open access" tables; these are used as the merge keys.
common_col_all_oa_names <- intersect(
  x = names(scopus_all_oa),
  y = names(wos_all_oa)
)

# Left join on every shared column: all Scopus rows are kept, and columns
# that exist only in the WoS table are filled in where a row agrees on all
# of the keys.
# NOTE(review): with all.x = TRUE, records found only in WoS are dropped, so
# this result alone cannot reveal publications Scopus missed -- confirm
# whether all = TRUE (or all.y = TRUE) was intended.
scopus_wos_all_oa <- merge(
  x = scopus_all_oa,
  y = wos_all_oa,
  by = common_col_all_oa_names,
  all.x = TRUE
)

Green

# Column names present in both the Scopus and the Web of Science "green"
# tables; these are used as the merge keys.
common_col_green_names <- intersect(
  x = names(scopus_green),
  y = names(wos_green)
)

# Left join on every shared column: all Scopus rows are kept, and columns
# that exist only in the WoS table are filled in where a row agrees on all
# of the keys.
# NOTE(review): with all.x = TRUE, records found only in WoS are dropped, so
# this result alone cannot reveal publications Scopus missed -- confirm
# whether all = TRUE (or all.y = TRUE) was intended.
scopus_wos_green <- merge(
  x = scopus_green,
  y = wos_green,
  by = common_col_green_names,
  all.x = TRUE
)

Green Final

# Column names present in both the Scopus and the Web of Science
# "green final" tables; these are used as the merge keys.
common_col_green_final_names <- intersect(
  x = names(scopus_green_final),
  y = names(wos_green_final)
)

# Left join on every shared column: all Scopus rows are kept, and columns
# that exist only in the WoS table are filled in where a row agrees on all
# of the keys.
# NOTE(review): with all.x = TRUE, records found only in WoS are dropped, so
# this result alone cannot reveal publications Scopus missed -- confirm
# whether all = TRUE (or all.y = TRUE) was intended.
scopus_wos_green_final <- merge(
  x = scopus_green_final,
  y = wos_green_final,
  by = common_col_green_final_names,
  all.x = TRUE
)

Green Accepted

# Column names present in both the Scopus and the Web of Science
# "green accepted" tables; these are used as the merge keys.
common_col_green_accepted_names <- intersect(
  x = names(scopus_green_accepted),
  y = names(wos_green_accepted)
)

# Left join on every shared column: all Scopus rows are kept, and columns
# that exist only in the WoS table are filled in where a row agrees on all
# of the keys.
# NOTE(review): with all.x = TRUE, records found only in WoS are dropped, so
# this result alone cannot reveal publications Scopus missed -- confirm
# whether all = TRUE (or all.y = TRUE) was intended.
scopus_wos_green_accepted <- merge(
  x = scopus_green_accepted,
  y = wos_green_accepted,
  by = common_col_green_accepted_names,
  all.x = TRUE
)

Not Green

# Column names present in both the Scopus and the Web of Science
# "not green" tables; these are used as the merge keys.
common_col_not_green_names <- intersect(
  x = names(scopus_not_green),
  y = names(wos_not_green)
)

# Left join on every shared column: all Scopus rows are kept, and columns
# that exist only in the WoS table are filled in where a row agrees on all
# of the keys.
# NOTE(review): with all.x = TRUE, records found only in WoS are dropped, so
# this result alone cannot reveal publications Scopus missed -- confirm
# whether all = TRUE (or all.y = TRUE) was intended.
scopus_wos_not_green <- merge(
  x = scopus_not_green,
  y = wos_not_green,
  by = common_col_not_green_names,
  all.x = TRUE
)

Just Corresponding authors

Restricting the data to corresponding authors only.

# Keep the records whose RP field contains "@northumbria" (matched
# case-insensitively).
all_oa_corr <- scopus_wos_all_oa %>%
  dplyr::filter(grepl(pattern = "@northumbria", x = RP, ignore.case = TRUE))

# Expand to one row per author: AU is split on ";" into a list-column and
# then unnested.
all_oa_single_authors <- all_oa_corr %>%
  mutate(AU = strsplit(x = as.character(AU), split = ";")) %>%
  unnest(AU)

# Keep the records whose RP field contains "@northumbria" (matched
# case-insensitively).
green_corr <- scopus_wos_green %>%
  dplyr::filter(grepl(pattern = "@northumbria", x = RP, ignore.case = TRUE))

# Expand to one row per author: AU is split on ";" into a list-column and
# then unnested.
green_single_authors <- green_corr %>%
  mutate(AU = strsplit(x = as.character(AU), split = ";")) %>%
  unnest(AU)

# Keep the records whose RP field contains "@northumbria" (matched
# case-insensitively).
green_final_corr <- scopus_wos_green_final %>%
  dplyr::filter(grepl(pattern = "@northumbria", x = RP, ignore.case = TRUE))

# Expand to one row per author: AU is split on ";" into a list-column and
# then unnested.
green_final_single_authors <- green_final_corr %>%
  mutate(AU = strsplit(x = as.character(AU), split = ";")) %>%
  unnest(AU)

# Keep the records whose RP field contains "@northumbria" (matched
# case-insensitively).
green_accepted_corr <- scopus_wos_green_accepted %>%
  dplyr::filter(grepl(pattern = "@northumbria", x = RP, ignore.case = TRUE))

# Expand to one row per author: AU is split on ";" into a list-column and
# then unnested.
green_accepted_single_authors <- green_accepted_corr %>%
  mutate(AU = strsplit(x = as.character(AU), split = ";")) %>%
  unnest(AU)

# Keep the records whose RP field contains "@northumbria" (matched
# case-insensitively).
not_green_corr <- scopus_wos_not_green %>%
  dplyr::filter(grepl(pattern = "@northumbria", x = RP, ignore.case = TRUE))

# Expand to one row per author: AU is split on ";" into a list-column and
# then unnested.
not_green_single_authors <- not_green_corr %>%
  mutate(AU = strsplit(x = as.character(AU), split = ";")) %>%
  unnest(AU)
# Normalise RP so it can be compared with AU: keep only the text before the
# first ";", then strip all commas and all full stops.
all_oa_single_authors$RP <- as.character(all_oa_single_authors$RP) %>%
  sub(";.*", "", .) %>%
  gsub(",", "", .) %>%
  gsub("\\.", "", .)

# Keep only the author rows whose name equals the cleaned RP value, i.e. the
# corresponding author. subset() also drops rows where the comparison is NA.
rp_single_author_all_oa <- subset(
  x = all_oa_single_authors,
  subset = AU == RP
)

# Normalise RP so it can be compared with AU: keep only the text before the
# first ";", then strip all commas and all full stops.
green_single_authors$RP <- as.character(green_single_authors$RP) %>%
  sub(";.*", "", .) %>%
  gsub(",", "", .) %>%
  gsub("\\.", "", .)

# Keep only the author rows whose name equals the cleaned RP value, i.e. the
# corresponding author. subset() also drops rows where the comparison is NA.
rp_single_author_green <- subset(
  x = green_single_authors,
  subset = AU == RP
)

# Normalise RP so it can be compared with AU: keep only the text before the
# first ";", then strip all commas and all full stops.
green_final_single_authors$RP <-
  as.character(green_final_single_authors$RP) %>%
  sub(";.*", "", .) %>%
  gsub(",", "", .) %>%
  gsub("\\.", "", .)

# Keep only the author rows whose name equals the cleaned RP value, i.e. the
# corresponding author. subset() also drops rows where the comparison is NA.
rp_single_author_green_final <- subset(
  x = green_final_single_authors,
  subset = AU == RP
)

# Normalise RP so it can be compared with AU: keep only the text before the
# first ";", then strip all commas and all full stops.
green_accepted_single_authors$RP <-
  as.character(green_accepted_single_authors$RP) %>%
  sub(";.*", "", .) %>%
  gsub(",", "", .) %>%
  gsub("\\.", "", .)

# Keep only the author rows whose name equals the cleaned RP value, i.e. the
# corresponding author. subset() also drops rows where the comparison is NA.
rp_single_author_green_accepted <- subset(
  x = green_accepted_single_authors,
  subset = AU == RP
)

# Normalise RP so it can be compared with AU: keep only the text before the
# first ";", then strip all commas and all full stops.
not_green_single_authors$RP <- as.character(not_green_single_authors$RP) %>%
  sub(";.*", "", .) %>%
  gsub(",", "", .) %>%
  gsub("\\.", "", .)

# Keep only the author rows whose name equals the cleaned RP value, i.e. the
# corresponding author. subset() also drops rows where the comparison is NA.
rp_single_author_not_green <- subset(
  x = not_green_single_authors,
  subset = AU == RP
)

Check for duplicates!

# Look for publications that appear in more than one access category by
# joining pairs of category tables on the DI column.
#
# inner_join() joins exactly two tables (x and y). The previous calls passed
# a third data frame positionally; because `by` was named, that third table
# was matched to the `copy` argument and never took part in the join. Each
# call below is therefore written as the two-table join that was actually
# being computed.
#
# na_matches = "never" prevents rows with a missing DI from being paired with
# every other row that also lacks a DI and reported as duplicates.

# Not green vs. green final.
# NOTE(review): this is the same pair of tables as
# duplicates_not_green_green_final below; the object name suggests a
# different comparison may have been intended -- confirm.
duplicates_green_green <- inner_join(
  scopus_wos_not_green,
  scopus_wos_green_final,
  by = "DI",
  na_matches = "never"
)

# Not green vs. green final.
duplicates_not_green_green_final <- inner_join(
  scopus_wos_not_green,
  scopus_wos_green_final,
  by = "DI",
  na_matches = "never"
)

# Not green vs. green accepted.
duplicates_not_green_green_acccepted <- inner_join(
  scopus_wos_not_green,
  scopus_wos_green_accepted,
  by = "DI",
  na_matches = "never"
)

# Green final vs. green accepted. Uses the merged Scopus/WoS table
# (scopus_wos_green_final), consistent with the other checks, instead of the
# Scopus-only table that was used here before.
duplicates_green_final_green_acccepted <- inner_join(
  scopus_wos_green_final,
  scopus_wos_green_accepted,
  by = "DI",
  na_matches = "never"
)